Identification of MHBs in cancers

Figure 1. Identification of MHBs in cancers. (A) Cancer types included in this study. Abbreviations and numbers of sequenced WGBS samples were indicated. (B) Representative MHB in colon cancer (chr1:1,067,442-1,068,291). Top: DNA methylation status of individual fragments (black: methylated CpGs; white: unmethylated CpGs). Middle: Mean signed LD R² between adjacent CpGs. Bottom: Pairwise signed LD R² between CpG sites. (C) Number of MHBs identified in each cancer type. (D) A Pie chart illustrates the proportion of MHBs annotated to promoter, exonic, intronic, and intergenic regions. (E) Assignment of MHBs into 16 non-overlapping clusters,including data from in-house samples, fresh frozen tissues (GSE230193), and public tumor datasets. (F) Cancer-specific MHB cluster enrichment in corresponding normal tissues. Enrichment analysis performed using LOLA package with the union of MHBs as background. Black rectangles indicate top-ranked significant enrichment (P < 0.05); grey indicates non-significant results.

loading required packages:

Identification of MHBs

The DNA Methylation haplotype blocks (MHBs) were identified by mHapSuite.

# example
java -jar mHapSuite-2.0-jar-with-dependencies.jar MHBDiscovery \
                                  -mhapPath BRCA.mhap.gz \
                                  -cpgPath hg19_CpG.gz \
                                  -tag MHB_BRCA  \
                                  -outputDir MHB/tumor

Figure 1C

# The number of MHBs across 11 cancers
cancer_type = list.files("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/",pattern=".bed")
counts <- lapply(cancer_type,function(x) {
            fread(paste0("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/",x),sep="\t") %>% nrow()})
names(counts) <- cancer_type %>% str_split_i("[_.]",i=2)
Ncount=as.data.frame(do.call(rbind,counts)) %>% rownames_to_column(var="Type")

p<-ggplot(Ncount,aes(Type,V1)) + 
    geom_bar(stat="identity",fill="steelblue",width=0.8)+
    scale_y_continuous(expand=c(0,0),limits=c(0,30000))+
    labs(y="The Number of MHBs")+
    theme_bw()+
    theme(panel.grid= element_blank(),
          panel.background=element_blank(),
          panel.border=element_blank(),
          axis.line=element_line(color="black"),
          axis.title.x=element_blank(),
          axis.text.x = element_text(color="black",size=10, angle=45,vjust=1,hjust=1),
          axis.text.y = element_text(color="black",size=10),
          axis.ticks = element_line(color="black"),
          axis.ticks.length = unit(5, "pt") )+
    geom_text(aes(label=V1), vjust=-0.3, size=3.5,color="black")
print(p)

Figure 1D

# MHB annotation in genomic regions
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
mhb_clusters_Anno <- annotatePeak("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_clusters.bed.gz",
                                  tssRegion=c(-2000, 2000),
                                  TxDb=txdb, 
                                  annoDb="org.Hs.eg.db")
>> loading peak file...              2025-05-17 23时01分56秒 
>> preparing features information...         2025-05-17 23时01分57秒 
>> identifying nearest features...       2025-05-17 23时01分57秒 
>> calculating distance from peak to TSS...  2025-05-17 23时01分58秒 
>> assigning genomic annotation...       2025-05-17 23时01分58秒 
>> adding gene annotation...             2025-05-17 23时02分00秒 
>> assigning chromosome lengths          2025-05-17 23时02分00秒 
>> done...                   2025-05-17 23时02分00秒 
plotAnnoPie(mhb_clusters_Anno)

Figure 1E

# MHB altas
mhb_CT_matrix = read.table("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_CT_matrix.txt.gz",sep = "\t",header=T)
names(mhb_CT_matrix) = gsub("MHB_","",names(mhb_CT_matrix))
mhb_CT_matrix = mhb_CT_matrix[c(1:6,11:31,8:10,7)]
mhb_CT_matrix=mhb_CT_matrix[order(mhb_CT_matrix$num,mhb_CT_matrix$cluster,decreasing=F),]  
# transform 1-0 matrix
mhb_CT_matrix[-c(1:6)][mhb_CT_matrix[-c(1:6)]>=1]<-1
rownames(mhb_CT_matrix) = paste(mhb_CT_matrix$chr,mhb_CT_matrix$start,mhb_CT_matrix$end,sep="_")
# Ordering function
order_f <-function(x,Tag,cluster){
    num = table(x[Tag][x$cluster==paste0(cluster,"_cluster"),])
    x[Tag][x$cluster==paste0(cluster,"_cluster"),]<-c(rep(0,num["0"]),rep(1,num["1"]))
    data = x
  return(data)
}

# Public GEO data : CRC_P_RRBS ESCC_P LIHC_P
mhb_CT_matrix = order_f(mhb_CT_matrix,"CRC_P","COAD")
mhb_CT_matrix = order_f(mhb_CT_matrix,"ESCC_P","ESCA")
mhb_CT_matrix = order_f(mhb_CT_matrix,"LIHC_P","LIHC")

# Our previous data : Cancer of Unknown Primary data (CUP)
cup<-c("BRCA","CESC","COADREAD","ESCA","HNSC","LIHC","LUSCLUAD","OV","STAD","THCA")
for (tag in cup) {
   if (tag == "COADREAD" ) {
         flag <- "COAD"
   }else if (tag == "LUSCLUAD") {
         flag <- "NSCLC"
   }else{
         flag <- tag
   }
     mhb_CT_matrix = order_f(mhb_CT_matrix,paste0("CUP_",tag),flag)
}

# MHB Map plot
annotation_row = mhb_CT_matrix[c(6)]
annotation_col = data.frame(Assay = factor(c(rep("WGBS",11),rep("RRBS",11),rep("WGBS",3))), 
                                          Tissue_Type = factor(c(rep("Tumor",24),rep("Normal",1))))
rownames(annotation_col) = names(mhb_CT_matrix[-c(1:6)])
annotation_colors = list(Assay=c(WGBS="#4DAF4A",RRBS="#984EA3"),
                        Tissue_Type=c(Tumor="#E41A1C",Normal="#377EB8"),
                        cluster=c(BRCA_cluster="#A6CEE3",CESC_cluster="#1F78B4",
                        COAD_cluster="#B2DF8A",ESCA_cluster="#FDBF6F",
                        HNSC_cluster="#33A02C",LIHC_cluster="#FB9A99",
                        NSCLC_cluster="#E31A1C",OV_cluster="#FF7F00",
                        PACA_cluster="#CAB2D6", STAD_cluster="#6A3D9A",
                        THCA_cluster="#B15928",cluster12="#1B9E77",
                        cluster13="#D95F02",cluster14="#7570B3",
                        cluster15="#E7298A",cluster16="#66A61E"))
data = as.data.frame(mhb_CT_matrix[-c(1:6)])

p <- pheatmap(data,show_rownames=F,cluster_cols= F,cluster_row=F,
                annotation_legend=T,legend=T,
                annotation_row=annotation_row,annotation_col=annotation_col,
                annotation_colors=annotation_colors,
                annotation_names_row=T,border = F,border_color = F,
                gaps_col = c(11,21,24),scale = "none",angle_col = 90,
                color = colorRampPalette(colors = c("white","red"))(1000))
print(p)

Figure 1F

# Cancer MHBs enrichment in Normal MHBs
mhb_T="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/mhb_clusters/"
background="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_clusters.bed.gz"
files = list.files(mhb_T)
locResults <- lapply(files,function(i){
            # Read input
            mhb=readBed(paste0(mhb_T,i))
            # Universe_Sets Background
            universe_set=fread(background) %>% toGRanges()
            # Overlapped normal MHB
            states=loadRegionDB("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/LOLA_Normal_DB")
            # Runing LOLA
            runLOLA(mhb, universe_set, states, cores=10)
            })
names(locResults) <- files

data <- lapply(files,function(x){ locResults[[x]][,c("filename","pValueLog")] %>% mutate(Type=x)})
data = as.data.frame(do.call(rbind,data))
dta = dcast(data,Type~filename,value.var = "pValueLog")
rownames(dta) = dta$Type;dta$Type=NULL
dta = dta[c("BRCA_cluster.bed","CESC_cluster.bed","COAD_cluster.bed",
            "ESCA_cluster.bed","HNSC_cluster.bed","LIHC_cluster.bed",
            "NSCLC_cluster.bed","OV_cluster.bed","PACA_cluster.bed",
            "STAD_cluster.bed", "THCA_cluster.bed","cluster12","cluster13",
            "cluster14","cluster15","cluster16"),]

# filter P value < 0.05
dta[dta< -log10(0.05)]<-NA

# enrichemnt ranking
rank_t = function(x) {
      m = rank(array(-x),na.last="keep",ties.method = "min")
      return(m)
  }
data_rank= as.data.frame(t(apply(dta,1,rank_t)))
names(data_rank) = names(dta)
tag<-names(data_rank)[-c(9)]
data_rank<- data_rank[c(tag,"MHB_placenta_normal.bed")]

# rename
rownames(data_rank) <- gsub(".bed.gz","",rownames(data_rank))
names(data_rank) <- gsub(".bed","",names(data_rank))

# The enrichment rank of Cancer MHBs vs. Normal MHBs 
p<- pheatmap(data_rank,
                main="Cancer MHBs vs. Normal MHBs",
                show_colnames=T,show_rownames=T,
                cluster_rows=F,cluster_cols=F,
                scale="none",angle_col="90",
                color=colorRampPalette(colors = c("firebrick","white","darkblue"))(1000)
                )
print(p)